We can also use cross-validation to measure out-of-sample prediction performance (see https://arxiv.org/pdf/2210.04482). LOOCV, however, cannot measure out-of-sample prediction performance for structured models such as spatial, temporal, or longitudinal models.
##From Day2 files
# Read the North Carolina SIDS shapefile (spData) and derive the quantities
# used by the spatial models below:
#   E74       - expected SIDS counts under a constant overall rate p_hat
#   nwp_hat74 - proportion of non-white births (covariate)
#   B.nc      - binary spatial adjacency matrix from the county polygons
#   CNTY_ID2  - a clean 1..n index for use in the f() terms
# Fixes: TRUE instead of the reassignable T; seq_along() instead of the
# 1:length(x) anti-pattern.
if (TRUE) {
  nc.sids <- st_read(system.file("shapes/sids.shp", package = "spData"))
  p_hat <- sum(nc.sids$SID74) / sum(nc.sids$BIR74)
  nc.sids$E74 <- p_hat * nc.sids$BIR74
  nc.sids$nwp_hat74 <- nc.sids$NWBIR74 / nc.sids$BIR74
  nc.adj <- poly2nb(nc.sids)
  B.nc <- nb2mat(nc.adj, style = "B")
  nc.sids$CNTY_ID2 <- seq_along(nc.sids$CNTY_ID)
}
## Reading layer `sids' from data source
## `/Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library/spData/shapes/sids.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 100 features and 22 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
## CRS: NA
# Model 1: covariate-only Poisson regression for the SIDS counts, with the
# expected counts E74 as exposure. gcpo groups for leave-group-out CV are
# built automatically with 4 level sets.
result1_sids <- inla(
  SID74 ~ nwp_hat74,
  family = "poisson",
  data = nc.sids,
  E = E74,
  control.predictor = list(compute = TRUE),
  control.compute = list(
    dic = TRUE, waic = TRUE, cpo = TRUE,
    return.marginals = FALSE,
    control.gcpo = list(enable = TRUE, num.level.sets = 4)
  )
)
# Model 2: adds an i.i.d. (unstructured) county-level random effect.
result2_sids <- inla(
  SID74 ~ nwp_hat74 + f(CNTY_ID2, model = "iid"),
  family = "poisson",
  data = nc.sids,
  E = E74,
  control.predictor = list(compute = TRUE),
  control.compute = list(
    dic = TRUE, waic = TRUE, cpo = TRUE,
    return.marginals = FALSE,
    control.gcpo = list(enable = TRUE, num.level.sets = 4)
  )
)
# Model 3: proper Besag spatial random effect on the county adjacency
# matrix B.nc. Note the data are passed as a plain data frame here.
result3_sids <- inla(
  SID74 ~ nwp_hat74 + f(CNTY_ID2, model = "besagproper", graph = B.nc),
  family = "poisson",
  data = as.data.frame(nc.sids),
  E = E74,
  control.predictor = list(compute = TRUE),
  control.compute = list(
    dic = TRUE, waic = TRUE, cpo = TRUE,
    return.marginals = FALSE,
    control.gcpo = list(enable = TRUE, num.level.sets = 4)
  )
)
# Model 4: BYM2 spatial model (structured + unstructured components) on the
# county adjacency graph.
# Fix: the original requested `po = TRUE` in control.compute — a copy-paste
# typo; the three sibling fits all request `cpo = TRUE`, and the comparison
# code below treats the four fits identically.
result4_sids <- inla(SID74 ~ nwp_hat74 + f(CNTY_ID2, model = "bym2", graph = B.nc),
data = as.data.frame(nc.sids),
family = "poisson",
control.compute = list(dic = TRUE, waic = TRUE,
cpo = TRUE, return.marginals = FALSE, control.gcpo = list(enable = TRUE, num.level.sets = 4)),
control.predictor = list(compute = TRUE),
E = E74)
# Inspect the automatically-built LGOCV group for observation 1 under each
# model. `$idx` lists the observations held out together with observation 1;
# `$corr` gives the reported correlations used to form the group (see the
# INLA gcpo documentation). With no random effect (model 1) the correlations
# are essentially 1; the random-effect models give much smaller values.
result1_sids$gcpo$groups[[1]]
## $idx
## [1] 1 19 38 78
##
## $corr
## [1] 1.0000000 0.9999920 0.9999960 0.9999973
result2_sids$gcpo$groups[[1]]
## $idx
## [1] 1 2 19 22 32 35 38 78 90
##
## $corr
## [1] 1.0000000 0.1638546 0.1638546 0.1693718 0.1693718 0.1638546 0.1663810
## [8] 0.1663810 0.1693718
# The two spatial models pick similar, small neighbour sets for observation 1.
result3_sids$gcpo$groups[[1]]
## $idx
## [1] 1 2 18 19
##
## $corr
## [1] 1.0000000 0.3897332 0.3726621 0.3678839
result4_sids$gcpo$groups[[1]]
## $idx
## [1] 1 2 18 19
##
## $corr
## [1] 1.0000000 0.2199047 0.1734146 0.2022057
#Plot the "model-based neighbours" for LGOCV
# For each of the four fits: mark the held-out county (value 1) and the
# members of its model-based LGOCV group (value 2), then map them.
for (k in 1:4) {
  col_k <- paste0("res", k, "_n")
  idx_k <- get(paste0("result", k, "_sids"))$gcpo$groups[[1]]$idx
  nc.sids[[col_k]] <- NA
  nc.sids[[col_k]][1] <- 1
  nc.sids[[col_k]][idx_k[-1]] <- 2
  plot(nc.sids[col_k],
       main = paste0("Model-based ", k, " neighbours for LGOCV"))
}
#Compute the log-score
# Leave-group-out log-score (negative mean log predictive density);
# smaller is better.
gcpo_log_score <- function(fit) -mean(log(fit$gcpo$gcpo))
LS_1 <- gcpo_log_score(result1_sids)
LS_2 <- gcpo_log_score(result2_sids)
LS_3 <- gcpo_log_score(result3_sids)
LS_4 <- gcpo_log_score(result4_sids)
c(LS_1, LS_2, LS_3, LS_4)
## [1] 2.218793 2.182051 2.184446 2.170247
The details are available at https://journals.sagepub.com/doi/10.1177/09622802241244613
(arXiv version https://arxiv.org/abs/2306.17236).
The joint density function of \(\pmb
x\) with precision parameters \(\tau_1,
\tau_2, ...,\tau_P\) is defined as \[\begin{equation}
\pi(\pmb x|\tau_{1}, \ldots, \tau_{P}) \propto
\exp\Big(-\dfrac{1}{4} \sum_
{\substack{i\text{ in sub-region } k \\ j\text{ in
sub-region } l \\ i \sim j \\ i > j }} (\tau_{l} + \tau_{k} )(x_i -
x_{j})^2 \Big), \quad k, l = 1, \ldots, P,
\label{eq::besagtype1}
\end{equation}\] with conditional densities \[\begin{equation*}
x_i |\pmb x_{-i}, \tau_{1}, \ldots, \tau_{P} \sim N
\Big(\dfrac{1}{2}\displaystyle \sum_{\substack{i\text{ in sub-region } k
\\ j\text{ in sub-region } l \\ i \sim j}} (\tau_{l} +
\tau_{k})\tau_{x_i}^{-1} x_j, \tau_{x_i}^{-1}\Big),
\end{equation*}\] and \[\begin{equation*}
\tau_{x_i} = \dfrac{1}{2}\Big(n_{i} \tau_{k} + \sum_{l} n_{il}
\tau_{l}\Big).
\end{equation*}\]
The joint PC prior for \(\pmb\theta = \log\pmb\tau\) can be derived as a convolution of the PC prior for \(\tau\) from the Besag model and an i.i.d. prior for the elements of \(\pmb\gamma\) such that \(\gamma_j\sim N(0, \sigma^2_\gamma)\), as follows \[\begin{equation} \pi(\pmb \theta) = 2^{-(P + 2)/2} \pi^{-P/2} \lambda \sigma^{-P} \exp \Big(-\frac{1}{2} (\pmb \theta- \pmb 1 \overline{\theta})^T \tilde{\pmb \Sigma}^{-1} (\pmb \theta- \overline{\theta} \pmb 1) - \overline{\theta}/2 - \lambda e^{-\overline{\theta}/2}\Big), \label{Jointprior} \end{equation}\]
We analyze the effects of hydrometeorological hazards on dengue risk in Brazil. To test the spatial variations in the spread of the virus in different sub-regions of Brazil, we fit dengue counts with a Poisson regression model as follows, \[\begin{equation} \pmb y \sim \text{Poisson}(E e^{\pmb \eta}), \quad \pmb \eta = \pmb 1^T \mu + \pmb \alpha \label{eq:brazildengue} \end{equation}\] where \(\pmb y\) is the vector of observed dengue case counts in November, \(E\) is the expected number of counts, \(\pmb \eta\) is the linear predictor, \(\mu\) is the overall intercept, and \(\pmb \alpha\) is the Besag (model 0) or flexible Besag model over space.
# Load the preprocessed Brazil workspace; this provides (at least) the
# sf map object `map_df` and the modelling data frame `data` used below.
load("Data/brazil.RData")
# Outline map of the municipalities (no fill).
ggplot(map_df) +
theme_bw() +
geom_sf(fill = NA)
The counts are mostly low although some areas have large counts.
# Histogram of dengue counts, truncated to x in [0, 1000] and y in [0, 500];
# the axis truncation is what triggers the "Removed ... rows" warnings below.
ggplot(data) +
geom_histogram(mapping = aes(Y), bins = 50, color = "black", fill = "grey") + xlim(0,1000) + ylim(0,500) +
labs(fill = "") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
axis.line = element_line(colour = "black")) +
labs(x="dengue cases",
y="count")
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Mean ratio of observed counts to exp(E) for each municipality (S1 = 1..558).
values <- numeric(558)
for (i in seq_len(558)) {
  values[i] <- mean(data$Y[data$S1 == i] / exp(data$E[data$S1 == i]))
}
# Bin the ratios into the five legend categories.
# Fix: the original assigned strings into the numeric vector, silently
# coercing it to character, so the subsequent >/< comparisons were
# lexicographic (e.g. "10" < "2" is TRUE) and values exactly on a boundary
# were never classified. cut() bins numerically and includes boundaries.
values <- as.character(cut(values,
                           breaks = c(-Inf, 0.5, 1, 2, 5, Inf),
                           labels = c("0.5", "1", "2", "3", "4")))
custom_palette <- c("#f7776d", "#ffc000", "#2b8ab1", "#bbcf33", "#e76cf2")
# Choropleth of the binned risk ratios.
# Fix: breaks are listed in fixed category order so each colour/label pairs
# with the intended bin. The original used `breaks = unique(values)`, whose
# order depends on the data order, so the fixed `labels` vector could be
# matched to the wrong categories.
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0) +
labs(fill = "") +
theme_void() +
scale_fill_manual(
values = custom_palette,
breaks = c("0.5", "1", "2", "3", "4"),
labels = c("<0.5", "0.5-1", "1-2", "2-5", "5+")) +
theme(legend.position = "bottom",
legend.title = element_blank(),
legend.text = element_text(size=14))
#Plot of neighbors for municipalities 1 and 21
# Build the contiguity neighbour list from the municipality polygons
# (queen = FALSE, i.e. shared edges only) and draw the adjacency links,
# highlighting those of municipalities 1 and 21 in blue (others NA/uncoloured).
neigh_graphs <- poly2nb(as(map_df$geometry, "Spatial"), queen = FALSE)
col_lines <- rep(NA, length(neigh_graphs))
col_lines[1] = "blue"
col_lines[21] = "blue"
plot(map_df$geometry, border = "grey")
plot(neigh_graphs, coords = map_df$geometry, col = col_lines, add = T, cex = 0.1)
## Warning in st_point_on_surface.sfc(coords): st_point_on_surface may not give
## correct results for longitude/latitude data
Now the question arises, how do we find groupings?
# Two candidate partitions of the municipalities shipped with the map:
# terrestrial biomes and administrative regions.
ggplot(map_df) +
geom_sf(aes(fill = biome_name), lwd = 0) +
labs(fill = "")
ggplot(map_df) +
geom_sf(aes(fill = region_name), lwd = 0) +
labs(fill = "")
We can formulate the flexible Besag model next.
#Function of fbesag - usually in the fbesag library
# rgeneric definition of the flexible Besag ("fbesag") model: a Besag-type
# CAR model in which each sub-region (element of the partition id_p) has its
# own precision parameter (https://arxiv.org/abs/2306.17236).
# INLA calls this function with `cmd` set to the quantity it needs
# ("graph", "Q", "log.prior", ...). W, id_p, scaled_cnst and sd_sim are
# supplied via inla.rgeneric.define() and found in the enclosing environment.
if (T){
'inla.rgeneric.fbesag.model' <- function(
cmd = c("graph", "Q", "mu", "initial", "log.norm.const",
"log.prior", "quit"),
theta = NULL) {
# Build the sparse precision matrix for adjacency structure R with
# per-node precisions prec[id_p]: off-diagonal (i,j), i~j, gets
# -(tau_k + tau_l)/2 and the diagonal collects the negated row sums
# plus a small jitter (1e-5, presumably for numerical stability).
sbesag <- function(R, prec, id_p, scaled_cnst, npart){
# Enumerate the (row, col) positions of all nonzeros: each node's
# neighbours followed by its own diagonal entry.
get_ij <- function(g){
ii <- c(); jj <- c()
for(i in 1:g$n){
ii <- c(ii,rep(i,g$nnbs[i]), i)
jj <- c(jj,g$nbs[[i]], i)
}
return(list(i=ii,j=jj))
}
g <- inla.read.graph(R)
x_scaled = c()
for(i in 1:g$n){
# Off-diagonal contributions: -tau_l/2 for each neighbour of i ...
tmp <- -0.5*prec[c(id_p[g$nbs[[i]]])]
mas_prec <- prec[id_p[i]]
# ... plus -tau_k/2 from node i itself, and the matching diagonal entry.
x_scaled <- c(x_scaled, tmp - 0.5*mas_prec, -sum(tmp) + 0.5*g$nnbs[i]*mas_prec + 1e-5)
}
r <- get_ij(g)
# Apply the scaling constant (from inla.scale.model) so the model is
# comparable with the scaled stationary Besag model.
return(sparseMatrix(i = r$i, j = r$j, x = scaled_cnst*x_scaled))
}
# One log-precision hyperparameter per sub-region.
npart <- length(unique(id_p))
dim_theta <- npart
# theta holds log-precisions; return them on the precision scale.
interpret.theta <- function() {
res <- list()
for(i in 1:dim_theta){
res[[i]] = exp(theta[i])
}
return(res)
}
# Nonzero structure of Q: the adjacency/structure matrix W.
graph <- function(){
require(Matrix)
return(Matrix(W,sparse=TRUE))
}
# Precision matrix at the current hyperparameter values.
Q <- function() {
res <- interpret.theta()
prec = c()
for(i in 1:dim_theta){
prec = c(prec, res[[i]])
}
myR <- sbesag(W, prec, id_p, scaled_cnst=scaled_cnst, npart)
return(inla.as.sparse(myR))
}
# Zero mean.
mu <- function(){return(numeric(0))}
# numeric(0): per the rgeneric convention, let INLA handle the constant.
log.norm.const <- function() {
return (numeric(0))
}
# Joint PC prior on the log-precisions: PC prior on the mean log-precision
# combined with a Gaussian (sd = sd_sim) on the deviations from the mean
# (cf. the joint prior equation in the text above).
log.prior <- function() {
#p1 = 1; p2 = 1e-5
# PC-prior parameters (u, alpha) for the base precision prior.
p1 = 0.5; p2 = 0.01
lam <- - log(p2)/p1
res <- interpret.theta()
prec = c()
for(i in 1:dim_theta){
prec = c(prec, res[[i]])
}
theta_p = log(prec)
sigm2 <- sd_sim^2
mean_theta <- mean(theta_p)
P = npart
# Pseudo-inverse of sigma^2 times the centring matrix (I - 11^T/P),
# built from its P-1 nonzero eigenvalues (the matrix is rank deficient).
e = eigen(diag(P) - (1/P)*matrix(1,P,P))
D = diag(c(1.0/e$values[1:(P-1)]))
inv_tilda_Sigma = (1/sigm2)*e$vectors[,1:(P-1)]%*%D%*%t(e$vectors[,1:(P-1)])
# PC-prior part on the mean log-precision ...
res1 <- log(lam) - (lam)*exp(-0.5*mean_theta) -0.5*mean_theta
# ... plus the Gaussian quadratic form on the centred log-precisions.
res2 <- -0.5*(theta_p-mean_theta)%*%inv_tilda_Sigma%*%(theta_p-mean_theta)
res <- drop(res1) + drop(res2)
return(res)
}
# Starting values for the log-precisions.
initial <- function() {
#return(initial_theta)
return(rep(4,npart))
}
quit <- function() {
return(invisible())
}
# Dispatch on the requested command.
res <- do.call(match.arg(cmd), args = list())
return(res)
}
}
#Model setup - usually in the fbesag library
# Sum-to-zero constraint over all areas; scale the structure matrix with
# inla.scale.model() and record the scaling constant relating the scaled
# and unscaled graphs (used inside the rgeneric model).
# Fixes: `<-` instead of `=` for assignment; nrow() instead of dim()[1].
constr.inter <- list(A = matrix(1, 1, nrow(graph)), e = rep(0, 1))
scaled_graph <- as.matrix(INLA:::inla.scale.model(graph, constr.inter))
scaled_cnst <- scaled_graph[1, 1] / graph[1, 1]
# Choose the partition that groups municipalities for the flexible Besag
# precisions: the terrestrial biomes in S4 (collapsed from 6 to 5 groups)
# or the administrative regions in S3.
# Fixes: TRUE instead of T; `<-` instead of `=`; the sequential remapping
# (3->2, 4->3, 5->4, 6->5) replaced by the equivalent vectorized shift.
Six_terrestrial_biomes <- TRUE
id_regions <- c()
if (Six_terrestrial_biomes) {
  id_regions <- data$S4
  # Merge biomes 2 and 3 and relabel 4..6 down by one, leaving groups 1..5.
  id_regions[id_regions >= 3] <- id_regions[id_regions >= 3] - 1
} else {
  id_regions <- data$S3
}
#PC prior setup
# PC prior on the precision: pc.prec with parameters (0.5, 0.01).
precision.prior <- list(prec = list(prior = "pc.prec", param = c(0.5, 0.01)))
#Define the matrices according to fbesag
# Instantiate the rgeneric model with the structure matrix, region ids,
# scaling constant and the prior sd of the log-precision deviations.
fbesag.model <- inla.rgeneric.define(inla.rgeneric.fbesag.model, W = graph, id_p = id_regions, scaled_cnst = scaled_cnst, sd_sim = 0.15)
config <- FALSE
# Baseline: scaled (stationary) Besag in space + RW1 in time. The RW1 term
# now reuses `precision.prior` instead of duplicating the same list inline,
# keeping the two formulas consistent (same prior, one definition).
baseformula <- Y ~ 1 +
  f(S1, model = "generic0", Cmatrix = scaled_graph, constr = TRUE, rankdef = 1,
    hyper = precision.prior) +
  f(T1, model = "rw1", constr = TRUE, scale.model = TRUE, hyper = precision.prior)
# Flexible Besag: one spatial precision per sub-region.
formula <- Y ~ 1 +
  f(S1, model = fbesag.model, constr = TRUE, rankdef = 1) +
  f(T1, model = "rw1", constr = TRUE, scale.model = TRUE, hyper = precision.prior)
#For computation time we use int.strategy = "eb"
#Usual besag model
# Stationary model: scaled Besag in space + RW1 in time, with the Gaussian
# approximation strategy and empirical-Bayes integration for speed.
# gcpo groups are built automatically with num.level.sets = 2.
model_naive <- inla(formula = baseformula, data = data, family = "poisson", offset = log(E),
control.inla = list(strategy = 'gaussian', int.strategy = "eb"),
control.compute = list(dic = TRUE, config = config,
cpo = TRUE, return.marginals = FALSE, control.gcpo = list(enable = TRUE, num.level.sets = 2)),
control.fixed = list(correlation.matrix = TRUE,
prec.intercept = 1, prec = 1),
control.predictor = list(link = 1, compute = TRUE))
#fbesag model
# Non-stationary model: identical settings, spatial term replaced by the
# flexible Besag rgeneric model.
model_fbesag <- inla(formula = formula, data = data, family = "poisson", offset = log(E),
control.inla = list(strategy = 'gaussian', int.strategy = "eb"),
control.compute = list(dic = TRUE, config = config,
cpo = TRUE, return.marginals = FALSE, control.gcpo = list(enable = TRUE, num.level.sets = 2)),
control.fixed = list(correlation.matrix = TRUE,
prec.intercept = 1, prec = 1),
control.predictor = list(link = 1, compute = TRUE))
The results are as follows:
# Model comparison table: DIC, CPO/GCPO log-scores and log marginal
# likelihood for stationary vs. non-stationary; the "Better?" row flags
# (as 0/1) whether the non-stationary model wins on each criterion.
results <- data.frame(
Row = c("stationary", "non-stationary", "Better?"),
DIC = c(0, 0, 0),
CPO = c(0, 0, 0),
GCPO = c(0, 0, 0),
logML = c(0, 0, 0)
)
# Negative mean log predictive density, skipping NA ordinates
# (equivalent to -mean(log(p[!is.na(p)]))); smaller is better.
neg_log_score <- function(p) -mean(log(p), na.rm = TRUE)
#DIC
results$DIC[1] <- round(model_naive$dic$dic, 0)
results$DIC[2] <- round(model_fbesag$dic$dic, 0)
results$DIC[3] <- results$DIC[2] < results$DIC[1]
#logML
results$logML[1] <- model_naive$mlik[1]
results$logML[2] <- model_fbesag$mlik[1]
results$logML[3] <- results$logML[2] > results$logML[1]
#gcpo
# Fix: the fbesag scores were previously subset by model_naive's NA mask
# (copy-paste bug); each model's scores must use its own NA pattern.
results$GCPO[1] <- neg_log_score(model_naive$gcpo$gcpo)
results$GCPO[2] <- neg_log_score(model_fbesag$gcpo$gcpo)
results$GCPO[3] <- results$GCPO[2] < results$GCPO[1]
#cpo (same NA-mask fix as for gcpo)
results$CPO[1] <- neg_log_score(model_naive$cpo$cpo)
results$CPO[2] <- neg_log_score(model_fbesag$cpo$cpo)
results$CPO[3] <- results$CPO[2] < results$CPO[1]
# Mean absolute difference between the two models' posterior spatial-effect
# means within each of the 5 sub-regions (first 558 entries of id_regions,
# one per municipality — assumes data is stacked with space varying fastest;
# TODO confirm against the data layout).
ch <- id_regions[1:558]
means <- numeric(5)
for(i in 1:5){
means[i] <- mean(abs(model_fbesag$summary.random$S1$mean[ch==i] - model_naive$summary.random$S1$mean[ch==i]))
}
# Internal-scale (log-precision) posterior summaries: one spatial precision
# for the stationary model vs. five for the flexible model.
f1_mean <- model_naive$internal.summary.hyperpar$mean[1]
f2_mean <- model_fbesag$internal.summary.hyperpar$mean[1:5]
f1_sd <- model_naive$internal.summary.hyperpar$sd[1]
f2_sd <- model_fbesag$internal.summary.hyperpar$sd[1:5]
print(means)
## [1] 0.003242781 0.002327383 0.032996773 0.003975427 0.008705653
print(results)
## Row DIC CPO GCPO logML
## 1 stationary -61841 Inf 16.55130 -118857.7
## 2 non-stationary -61943 Inf 16.55109 -118618.6
## 3 Better? 1 0 1.00000 1.0
# Stationary log-precision minus each sub-region's log-precision, and the
# corresponding differences in posterior sd.
f1_mean - f2_mean
## [1] -0.24000436 0.02893500 -0.10633755 -0.04253707 0.09863401
f1_sd - f2_sd
## [1] -0.04908586 -0.04879767 -0.11355986 -0.06704619 -0.02684723
Let’s visualize some of the results.
#Estimated tau values
# For each municipality, average the estimated internal-scale (log)
# precisions of the sub-regions its graph neighbours belong to, which
# smooths tau across sub-region borders for display.
values <- numeric(558)
diff_m1 <- model_fbesag$internal.summary.hyperpar$mean[1:5]
for(i in 1:558) values[i] <- mean(diff_m1[id_regions[neigh_graphs[[i]]]])
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0) +
scale_fill_gradient_tableau("Blue-Teal") +
labs(fill = expression(tau[i])) +
theme_void()
#Estimated differences of the tau's
# Difference between each flexible-model log-precision and the single
# stationary log-precision, averaged over each area's neighbours and mapped
# on a diverging scale centred at zero.
values <- numeric(558)
diff_m1 <- c(model_fbesag$internal.summary.hyperpar$mean[1:5]) - c(model_naive$internal.summary.hyperpar$mean[1])
for(i in 1:558) values[i] <- mean(diff_m1[id_regions[neigh_graphs[[i]]]])
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
labs(fill = "") +
theme_void()
#Estimated differences of the SD's of tau
# Same construction as above but for the posterior standard deviations of
# the log-precisions.
values <- numeric(558)
diff_m1 <- c(model_fbesag$internal.summary.hyperpar$sd[1:5]) - c(model_naive$internal.summary.hyperpar$sd[1])
for(i in 1:558) values[i] <- mean(diff_m1[id_regions[neigh_graphs[[i]]]])
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
labs(fill = "") +
theme_void()
We can also look at the posterior mean of the random effect from the two
models.
#Fitted values
# Posterior mean of the spatial effect x under the stationary Besag model.
# (The `values <- numeric(558)` pre-allocations were dead code: the vector
# is immediately replaced by a full summary column, so they are dropped.)
values <- model_naive$summary.random$S1$mean
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0.1) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
labs(fill = "") +
ggtitle("Posterior mean of x - Besag") +
theme_void()
# Same map for the flexible Besag model.
values <- model_fbesag$summary.random$S1$mean
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0.1) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
labs(fill = "") +
ggtitle("Posterior mean of x - fBesag") +
theme_void()
We can also look at the difference between the random effects estimated
from the stationary and non-stationary model.
#Difference in x between the two models
# Maps of fbesag-minus-Besag differences in the posterior mean and in the
# 97.5th / 2.5th percentiles of the spatial effect.
# (Dead `values <- numeric(558)` pre-allocations removed: each was
# immediately overwritten by a full-length vector. Column access uses
# backticks, the conventional quoting for non-syntactic names.)
values <- c(model_fbesag$summary.random$S1$mean) - c(model_naive$summary.random$S1$mean)
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0.1) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
labs(fill = "") +
ggtitle("Difference in the posterior mean of x") +
theme_void()
#97.5quantile
values <- c(model_fbesag$summary.random$S1$`0.975quant`) - c(model_naive$summary.random$S1$`0.975quant`)
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0.1) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
labs(fill = "") +
ggtitle("Difference in 97.5th percentile of x") +
theme_void()
#2.5quantile
values <- c(model_fbesag$summary.random$S1$`0.025quant`) - c(model_naive$summary.random$S1$`0.025quant`)
ggplot(map_df) +
geom_sf(aes(fill = values), lwd = 0.1) +
scale_fill_gradient2(low = "#49B8F1",
mid = "white",
high = "red") +
ggtitle("Difference in 2.5th percentile of x") +
labs(fill = "") +
theme_void()